library(patchwork)
library(readxl)
library(tidyverse)
library(naniar)
library(gridExtra)
library(CBPS)
library(survey)
library(polspline)
library(sjPlot)
library(mice)
library(margins)
library(butils)
library(emmeans)
library(lme4)
library(finalfit)
library(xtable)
library(boot)
library(tidyverse)
library(haven)
library(CBPS)
library(dineq)
library(survey)
library(naniar)
library(RSQLite)
library(gridExtra)
library(MatchIt)
library(sjPlot)
library(sl3)
library(gbm)
library(e1071)
library(entropy)
library(xtable)
library(mice)
library(cowplot)
library(mipfp)
library(dineq)
library(IC2)

setwd("Your Working Directory")

load("psid_imputed.rDa")


####################Functions####################
case_comparison <- function(obs,com){
  
  # Turn to a data frame
  d <- merge(as.data.frame(obs),as.data.frame(com),by=c("Var1","Var2"))
  
  # Fit the model
  mod <- glm( Freq.x ~ Var1 + Var2 + offset(log(Freq.y)), data=d, family=poisson("log") )
  
  # Get the fitted values
  d$Freq <- predict(mod, type="response")  
  
  return(xtabs(Freq ~ Var1 + Var2, data = d))
}




intervention.delta <- function(intervention,observed){
  #adding a small value to avoid 0  
  intervention <- intervention + 0.1
  observed <- observed + 0.1
  
  mat.int <- matrix(nrow=nrow(intervention),ncol=ncol(intervention))
  for (i in 1:nrow(intervention)) {
    for (j in 1:ncol(intervention)) {
      mat.int[i,j] <- intervention[i,j]/intervention[i,1]
    }
  }
  
  mat.obs <- matrix(nrow=nrow(observed),ncol=ncol(observed))
  for (i in 1:nrow(observed)) {
    for (j in 1:ncol(observed)) {
      mat.obs[i,j] <- observed[i,j]/observed[i,1]
    }
  }
  
  return(mat.int/mat.obs)
}


pips <- function(origin,pscores,delta){
  
  temp <- matrix(nrow=nrow(pscores),ncol=ncol(pscores))
  
  for (i in 1:length(unique(origin))) {
    temp[origin==i,] <- t(t(pscores[origin==i,]) * delta[i,]) / as.vector((pscores[origin==i,] %*% matrix(delta[i,])))
  }
  
  return(temp)
}


controls <- c("ER32000","ER70882","ER23013","ER26994","ER68589","ER68590","ER68571","ER68591","ER71534","ER70876","ER70879","ER70880")

psid_imputed$loginc <- log(psid_imputed$income)



############################################################





####Mobility specification

####Estimation: Mean, Propensity, Second Moment

#Propensity
pscore <- multinom(edu ~ ., psid_imputed %>% dplyr::select(edu,controls))
psid_imputed[,c("prob1","prob2","prob3")] <- predict(pscore,newdata = psid_imputed %>% dplyr::select(edu,controls),"probs")

obs.dat <- as.data.frame(table(psid_imputed$faedu,psid_imputed$edu))
obs.dat$ind <- predict(glm(Freq ~ Var1 + Var2, data = obs.dat, family = poisson("log")),type="response")

obs <- xtabs(Freq ~ Var1 + Var2, obs.dat)
ind <- xtabs(ind ~ Var1 + Var2, obs.dat)

psid_imputed[,c("pips.ind1","pips.ind2","pips.ind3")] <- pips(origin=psid_imputed$faedu,pscores = as.matrix(psid_imputed[,c("prob1","prob2","prob3")]),delta = intervention.delta(ind,obs))


#Mean
psid_imputed$loginc2 <- (psid_imputed$loginc)^2


set.seed(08540)

income_task <- make_sl3_Task(
  data = psid_imputed,
  covariates = c(controls,"edu"),
  outcome = "loginc",
  outcome_type = "continuous")

sl3_list_learners("continuous") 


sl <- make_learner("Lrnr_glm")

sl_fit <- sl$train(income_task)


psid_imputed1 <- psid_imputed %>% dplyr::select(-edu) %>% mutate(edu = as.factor(rep(1,nrow(psid_imputed))),.keep = "all")
psid_imputed2 <- psid_imputed %>% dplyr::select(-edu) %>% mutate(edu = as.factor(rep(2,nrow(psid_imputed))),.keep = "all")
psid_imputed3 <- psid_imputed %>% dplyr::select(-edu) %>% mutate(edu = as.factor(rep(3,nrow(psid_imputed))),.keep = "all")
psid_imputed1$edu <- factor(psid_imputed1$edu, levels = c(1,2,3))
psid_imputed2$edu <- factor(psid_imputed2$edu, levels = c(1,2,3))
psid_imputed3$edu <- factor(psid_imputed3$edu, levels = c(1,2,3))

psid_imputed$pred <- sl_fit$predict(make_sl3_Task(
  data = psid_imputed,
  covariates = c(controls,"edu"),
  outcome = "loginc",
  outcome_type = "continuous"))

psid_imputed$pred1 <- sl_fit$predict(make_sl3_Task(
  data = psid_imputed1,
  covariates = c(controls,"edu"),
  outcome = "loginc",
  outcome_type = "continuous"))

psid_imputed$pred2 <- sl_fit$predict(make_sl3_Task(
  data = psid_imputed2,
  covariates = c(controls,"edu"),
  outcome = "loginc",
  outcome_type = "continuous"))

psid_imputed$pred3 <- sl_fit$predict(make_sl3_Task(
  data = psid_imputed3,
  covariates = c(controls,"edu"),
  outcome = "loginc",
  outcome_type = "continuous"))









#Second Moment
set.seed(08540)


income2_task <- make_sl3_Task(
  data = psid_imputed,
  covariates = c(controls,"edu"),
  outcome = "loginc2",
  outcome_type = "continuous")


sl2_fit <- sl$train(income2_task)


psid_imputed$pred2 <- sl_fit$predict(make_sl3_Task(
  data = psid_imputed,
  covariates = c(controls,"edu"),
  outcome = "loginc2",
  outcome_type = "continuous"))

psid_imputed$pred21 <- sl_fit$predict(make_sl3_Task(
  data = psid_imputed1,
  covariates = c(controls,"edu"),
  outcome = "loginc2",
  outcome_type = "continuous"))

psid_imputed$pred22 <- sl_fit$predict(make_sl3_Task(
  data = psid_imputed2,
  covariates = c(controls,"edu"),
  outcome = "loginc2",
  outcome_type = "continuous"))

psid_imputed$pred23 <- sl_fit$predict(make_sl3_Task(
  data = psid_imputed3,
  covariates = c(controls,"edu"),
  outcome = "loginc2",
  outcome_type = "continuous"))

#Expected realized outcome

psid_imputed[,"ero"] <- apply(psid_imputed[,c("prob1","prob2","prob3")] *
                                psid_imputed[,c("pred1","pred2","pred3")],1,sum)
psid_imputed[,"piero.ind"] <- apply(psid_imputed[,c("pips.ind1","pips.ind2","pips.ind3")] *
                                      psid_imputed[,c("pred1","pred2","pred3")],1,sum)


####Estimator for the mean and group mean
mean(psid_imputed$piero.ind)
mean(psid_imputed$ero)

mean(as.matrix(psid_imputed[psid_imputed$faedu==1,"piero.ind"]))
mean(as.matrix(psid_imputed[psid_imputed$faedu==1,"ero"]))

mean(as.matrix(psid_imputed[psid_imputed$faedu==2,"piero.ind"]))
mean(as.matrix(psid_imputed[psid_imputed$faedu==2,"ero"]))

mean(as.matrix(psid_imputed[psid_imputed$faedu==3,"piero.ind"]))
mean(as.matrix(psid_imputed[psid_imputed$faedu==3,"ero"]))

####Construct MIEV Estimator
psid_imputed <- psid_imputed %>% mutate(mie_in_variance = (pips.ind1 - prob1)*pred21 +
                                          (pips.ind2 - prob2)*pred22 +
                                          (pips.ind3 - prob3)*pred23 -
                                          (piero.ind^2 - ero^2))

#Total mie on variance equals mie on in-group variance plus mie on between-group variance
mean(psid_imputed$mie_in_variance) + (var(psid_imputed$piero.ind)-var(psid_imputed$ero))
var(psid_imputed$loginc)


#This reproduces Figure 5 of the article

teh <- glm(loginc ~ as.factor(edu):as.factor(faedu) +ER32000+ER70882+ER23013+ER26994+ER68589+ER68590+ER68571+ER68591+ER71534+ER70876+ER70879+ER70880, data = psid_imputed)
teh_con <- contrast(emmeans(teh, ~ edu * faedu, rg.limit = 50000), "revpairwise", by = "faedu", adjust = "none")

teh <- as.data.frame(cbind(c(1,1,1,2,2,2),
                           c(1,2,3,1,2,3),
                           c(as.numeric(as.data.frame(teh_con[1])[3]),
                             as.numeric(as.data.frame(teh_con[4])[3]),
                             as.numeric(as.data.frame(teh_con[7])[3]),
                             as.numeric(as.data.frame(teh_con[3])[3]),
                             as.numeric(as.data.frame(teh_con[6])[3]),
                             as.numeric(as.data.frame(teh_con[9])[3])),
                           c(as.numeric(as.data.frame(teh_con[1])[4]),
                             as.numeric(as.data.frame(teh_con[4])[4]),
                             as.numeric(as.data.frame(teh_con[7])[4]),
                             as.numeric(as.data.frame(teh_con[3])[4]),
                             as.numeric(as.data.frame(teh_con[6])[4]),
                             as.numeric(as.data.frame(teh_con[9])[4]))
))

colnames(teh) <- c("degree","padeg","mean","se")


ggplot(data = teh,
       aes(x=as.factor(padeg),y=as.numeric(mean),shape=as.factor(degree)))+
  geom_point(size=5,position=position_dodge(width=0.4))+
  geom_errorbar(width=0.2,position=position_dodge(width=0.4),aes(ymin=as.numeric(mean)-1.96*as.numeric(se),ymax=as.numeric(mean)+1.96*as.numeric(se)))+
  scale_shape_discrete(breaks=c(1,2),label=c("HS vs Lower than HS","College vs HS"))+
  scale_x_discrete(breaks=c(1,2,3),labels=c("Less than HS","HS","College"))+
  labs(x="Father Education",y="Education Effect on Logged Income",shape="Education Effect")+
  theme_bw()+
  theme(text = element_text(size=20))





####Residual Imputation

psid_imputed$resid <- psid_imputed$income - exp(psid_imputed$pred)
psid_imputed[psid_imputed$edu==1,"resid1"] <- psid_imputed[psid_imputed$edu==1,"resid"]
psid_imputed[psid_imputed$edu==2,"resid2"] <- psid_imputed[psid_imputed$edu==2,"resid"]
psid_imputed[psid_imputed$edu==3,"resid3"] <- psid_imputed[psid_imputed$edu==3,"resid"]

psid_imputed <- psid_imputed %>% group_by(edu) %>% mutate(percentile = round(rank(resid)/length(resid),3))
psid_imputed <- psid_imputed %>% group_by(percentile) %>% 
  mutate(residual1 = mean(resid1,na.rm=T),
         residual2 = mean(resid2,na.rm=T),
         residual3 = mean(resid3,na.rm=T))

psid_imputed["predimput1"] <- exp(psid_imputed[,"pred1"]) + psid_imputed[,"residual1"]
psid_imputed["predimput2"] <- exp(psid_imputed[,"pred2"]) + psid_imputed[,"residual2"]
psid_imputed["predimput3"] <- exp(psid_imputed[,"pred3"]) + psid_imputed[,"residual3"]

psid_imputed[,"piero.ind.imp"] <- apply(psid_imputed[,c("pips.ind1","pips.ind2","pips.ind3")] *
                                          psid_imputed[,c("predimput1","predimput2","predimput3")],1,sum)


psid_imputed <- psid_imputed %>% filter(piero.ind.imp>0)

#Density Plot
res1 <- ggplot(psid_imputed %>% dplyr::select(income,piero.ind.imp) %>% pivot_longer(!percentile),
               aes(x=log(value)))+
  geom_density(aes(linetype=name))+
  scale_x_continuous(limits = c(3,15))+
  scale_y_continuous(limits = c(0,0.6))+
  labs(x="Logged Income",y="Probability Density",title="Total Population",linetype="")+
  scale_linetype_discrete(breaks=c("income","piero.ind.imp"),labels=c("Observed","Post-Intervention"))+
  theme_bw()+
  theme(text = element_text(size=25))

#Conditional Density Plot
res2 <- ggplot(psid_imputed %>% filter(faedu ==1) %>% dplyr::select(income,piero.ind.imp)  %>% pivot_longer(!percentile),
               aes(x=log(value)))+
  geom_density(aes(linetype=name))+
  scale_x_continuous(limits = c(3,15))+
  scale_y_continuous(limits = c(0,0.6))+
  labs(x="Logged Income",y="Probability Density",title="Father Lower than HS",linetype="")+
  scale_linetype_discrete(breaks=c("income","piero.ind.imp"),labels=c("Observed","Post-Intervention"))+
  theme_bw()+
  theme(text = element_text(size=25))

res3 <- ggplot(psid_imputed %>% filter(faedu ==2) %>% dplyr::select(income,piero.ind.imp)  %>% pivot_longer(!percentile),
               aes(x=log(value)))+
  geom_density(aes(linetype=name))+
  scale_x_continuous(limits = c(3,15))+
  scale_y_continuous(limits = c(0,0.6))+
  labs(x="Logged Income",y="Probability Density",title="Father HS",linetype="")+
  scale_linetype_discrete(breaks=c("income","piero.ind.imp"),labels=c("Observed","Post-Intervention"))+
  theme_bw()+
  theme(text = element_text(size=25))

res4 <- ggplot(psid_imputed %>% filter(faedu ==3) %>% dplyr::select(income,piero.ind.imp)  %>% pivot_longer(!percentile),
               aes(x=log(value)))+
  geom_density(aes(linetype=name))+
  scale_x_continuous(limits = c(3,15))+
  scale_y_continuous(limits = c(0,0.6))+
  labs(x="Logged Income",y="Probability Density",title="Father College",linetype="")+
  scale_linetype_discrete(breaks=c("income","piero.ind.imp"),labels=c("Observed","Post-Intervention"))+
  theme_bw()+
  theme(text = element_text(size=25))




#####This Reproduces Figure 6 of the Original Article
res1+res2+res3+res4+plot_layout(guides = "collect") & theme(legend.position = "bottom")





#Function: Step-Wise OR (Code work for 3-category OD only, ten steps, 0 means total independence, 10 means observed data)
stepor <- function(table){
  
  d <- as.data.frame(table)
  # Fit the model
  mod <- glm( Freq ~ Var1 + Var2 + Var1*Var2, data=d, family=poisson("log") )
  
  for (i in 0:10) {
    d[,paste0("asso",i)] <- 0
    d[c(5:6,8:9),paste0("asso",i)] <- 0.1*i* coef(mod)[6:9] 
    d[,paste0("pred",i)] <- predict(glm(as.formula(paste0("Freq ~ Var1 + Var2 + offset(",paste0("asso",i),")")), data=d, family=poisson("log") ), type="response")  
  }
  
  return(d)
}

stepdat <- stepor(table(psid_imputed$faedu,psid_imputed$edu))
for (i in 0:10) {
  assign(paste0("step.",i) , xtabs(as.formula(paste0(paste0("pred",i),"~ Var1 + Var2")),stepdat) )
}
for (i in 0:10) {
  psid_imputed[,c(paste0("pips.step",i,".1"),paste0("pips.step",i,".2"),paste0("pips.step",i,".3"))] <- pips(origin=psid_imputed$faedu,pscores = as.matrix(psid_imputed[,c("prob1","prob2","prob3")]),delta = intervention.delta(eval(as.name(paste0("step.",i))),obs))
}
for (i in 0:10) {
  psid_imputed[,paste0("piero.step.imp",i)] <- apply(psid_imputed[,c(paste0("pips.step",i,".1"),paste0("pips.step",i,".2"),paste0("pips.step",i,".3"))] *
                                                       psid_imputed[,c("predimput1","predimput2","predimput3")],1,sum)
  psid_imputed <- psid_imputed %>% filter(piero.ind.imp>0)
}

steps.dat <- matrix(nrow = 11, ncol = 4)

for (i in 0:10) {
  
  steps.dat[i+1,] <- c(
    #imputed logged variance,
    as.numeric(
      var(log(psid_imputed[,paste0("piero.step.imp",i)]),na.rm=T)
    ),
    
    #90/10 ratio
    as.numeric(
      sort(as.matrix(psid_imputed[!is.na(psid_imputed[,paste0("piero.step.imp",i)]),paste0("piero.step.imp",i)]))[round(0.9*nrow(psid_imputed %>% drop_na(paste0("piero.step.imp",i))))] / 
        sort(as.matrix(psid_imputed[!is.na(psid_imputed[,paste0("piero.step.imp",i)]),paste0("piero.step.imp",i)]))[round(0.1*nrow(psid_imputed %>% drop_na(paste0("piero.step.imp",i))))]
    ),
    
    #80/20 ratio
    as.numeric(
      sort(as.matrix(psid_imputed[!is.na(psid_imputed[,paste0("piero.step.imp",i)]),paste0("piero.step.imp",i)]))[round(0.8*nrow(psid_imputed %>% drop_na(paste0("piero.step.imp",i))))] / 
        sort(as.matrix(psid_imputed[!is.na(psid_imputed[,paste0("piero.step.imp",i)]),paste0("piero.step.imp",i)]))[round(0.2*nrow(psid_imputed %>% drop_na(paste0("piero.step.imp",i))))] 
    ),
    
    #Gini
    as.numeric(gini.wtd(pull(psid_imputed,paste0("piero.step.imp",i))))
  )
}


#############Bootstrap OLS predictor, storing pre- and post-intervention mean, conditional mean, variance, 90/10 ratio, and 80/20 ratio
#############Estimating conditional variance using the residual-based instead of second-moment based estimator
#############Second-moment based estimation produces negative variance estimates? Residual-based produces overly-large estimates and insignificant mie?

bootstat <- function(dat,i){
  
  data <- dat[i,]
  
  #Propensity
  pscore <- multinom(edu ~ ., data %>% dplyr::select(edu,controls))
  data[,c("prob1","prob2","prob3")] <- predict(pscore,newdata = data %>% dplyr::select(edu,controls),"probs")
  
  obs.dat <- as.data.frame(table(data$faedu,data$edu))
  obs.dat$ind <- predict(glm(Freq ~ Var1 + Var2, data = obs.dat, family = poisson("log")),type="response")
  
  obs <- xtabs(Freq ~ Var1 + Var2, obs.dat)
  ind <- xtabs(ind ~ Var1 + Var2, obs.dat)
  
  data[,c("pips.ind1","pips.ind2","pips.ind3")] <- pips(origin=data$faedu,pscores = as.matrix(data[,c("prob1","prob2","prob3")]),delta = intervention.delta(ind,obs))
  
  
  #Mean
  data$loginc2 <- (data$loginc)^2
  
  
  set.seed(08540)
  
  income_task <- make_sl3_Task(
    data = data,
    covariates = c(controls,"edu"),
    outcome = "loginc",
    outcome_type = "continuous")
  
  
  sl <- make_learner("Lrnr_glm")
  
  sl_fit <- sl$train(income_task)
  
  
  data1 <- data %>% dplyr::select(-edu) %>% mutate(edu = as.factor(rep(1)),.keep = "all")
  data2 <- data %>% dplyr::select(-edu) %>% mutate(edu = as.factor(rep(2)),.keep = "all")
  data3 <- data %>% dplyr::select(-edu) %>% mutate(edu = as.factor(rep(3)),.keep = "all")
  data1$edu <- factor(data1$edu, levels = c(1,2,3))
  data2$edu <- factor(data2$edu, levels = c(1,2,3))
  data3$edu <- factor(data3$edu, levels = c(1,2,3))
  
  data$pred <- sl_fit$predict(make_sl3_Task(
    data = data,
    covariates = c(controls,"edu"),
    outcome = "loginc",
    outcome_type = "continuous"))
  
  data$pred1 <- sl_fit$predict(make_sl3_Task(
    data = data1,
    covariates = c(controls,"edu"),
    outcome = "loginc",
    outcome_type = "continuous"))
  
  data$pred2 <- sl_fit$predict(make_sl3_Task(
    data = data2,
    covariates = c(controls,"edu"),
    outcome = "loginc",
    outcome_type = "continuous"))
  
  data$pred3 <- sl_fit$predict(make_sl3_Task(
    data = data3,
    covariates = c(controls,"edu"),
    outcome = "loginc",
    outcome_type = "continuous"))
  
  
  
  #Expected realized outcome
  
  data[,"ero"] <- apply(data[,c("prob1","prob2","prob3")] *
                          data[,c("pred1","pred2","pred3")],1,sum)
  data[,"piero.ind"] <- apply(data[,c("pips.ind1","pips.ind2","pips.ind3")] *
                                data[,c("pred1","pred2","pred3")],1,sum)
  
  ####Residual Imputation
  
  data$resid <- data$income - exp(data$pred)
  data[data$edu==1,"resid1"] <- data[data$edu==1,"resid"]
  data[data$edu==2,"resid2"] <- data[data$edu==2,"resid"]
  data[data$edu==3,"resid3"] <- data[data$edu==3,"resid"]
  
  data <- data %>% group_by(edu) %>% mutate(percentile = round(rank(resid)/length(resid),3))
  data <- data %>% group_by(percentile) %>% 
    mutate(residual1 = mean(resid1,na.rm=T),
           residual2 = mean(resid2,na.rm=T),
           residual3 = mean(resid3,na.rm=T))
  
  data["predimput1"] <- exp(data[,"pred1"]) + data[,"residual1"]
  data["predimput2"] <- exp(data[,"pred2"]) + data[,"residual2"]
  data["predimput3"] <- exp(data[,"pred3"]) + data[,"residual3"]
  
  data[,"piero.ind.imp"] <- apply(data[,c("pips.ind1","pips.ind2","pips.ind3")] *
                                    data[,c("predimput1","predimput2","predimput3")],1,sum)
  data <- data %>% filter(piero.ind.imp>0)
  
  #Variance (Squared Residual) Estimation
  #  data$residsquared <- (data$resid)^2
  
  #  set.seed(08540)
  
  #  residsquared_task <- make_sl3_Task(
  #    data = data,
  #  covariates = c(controls,"edu"),
  # outcome = "residsquared",
  # outcome_type = "continuous")
  
  
  #  stack <- make_learner_stack(
  #  "Lrnr_polspline"
  #)
  
  #meta <- make_learner("Lrnr_glm")
  
  # sl <- make_learner(Lrnr_sl, 
  #                     learners = stack,
  #                  metalearner = meta)
  
  #sl2_fit <- sl$train(residsquared_task)
  
  data1 <- data %>% dplyr::select(-edu) %>% mutate(edu = as.factor(rep(1)),.keep = "all")
  data2 <- data %>% dplyr::select(-edu) %>% mutate(edu = as.factor(rep(2)),.keep = "all")
  data3 <- data %>% dplyr::select(-edu) %>% mutate(edu = as.factor(rep(3)),.keep = "all")
  data1$edu <- factor(data1$edu, levels = c(1,2,3))
  data2$edu <- factor(data2$edu, levels = c(1,2,3))
  data3$edu <- factor(data3$edu, levels = c(1,2,3))
  
  
  # data$predresidsquared <- sl_fit$predict(make_sl3_Task(
  #     data = data,
  #     covariates = c(controls,"edu"),
  #     outcome = "residsquared",
  #     outcome_type = "continuous"))
  
  #  data$predresidsquared1 <- sl_fit$predict(make_sl3_Task(
  #     data = data1,
  #     covariates = c(controls,"edu"),
  #     outcome = "residsquared",
  #     outcome_type = "continuous"))
  
  #   data$predresidsquared2 <- sl_fit$predict(make_sl3_Task(
  #     data = data2,
  #     covariates = c(controls,"edu"),
  #     outcome = "residsquared",
  #     outcome_type = "continuous"))
  
  #   data$predresidsquared3 <- sl_fit$predict(make_sl3_Task(
  #     data = data3,
  #     covariates = c(controls,"edu"),
  #     outcome = "residsquared",
  #     outcome_type = "continuous"))
  
  #Second Moment Estimation
  set.seed(08540)
  
  income2_task <- make_sl3_Task(
    data = data,
    covariates = c(controls,"edu"),
    outcome = "loginc2",
    outcome_type = "continuous")
  
  sl2_fit <- sl$train(income2_task)
  
  data$pred20 <- sl2_fit$predict(make_sl3_Task(
    data = data,
    covariates = c(controls,"edu"),
    outcome = "loginc2",
    outcome_type = "continuous"))
  
  data$pred21 <- sl2_fit$predict(make_sl3_Task(
    data = data1,
    covariates = c(controls,"edu"),
    outcome = "loginc2",
    outcome_type = "continuous"))
  
  data$pred22 <- sl2_fit$predict(make_sl3_Task(
    data = data2,
    covariates = c(controls,"edu"),
    outcome = "loginc2",
    outcome_type = "continuous"))
  
  data$pred23 <- sl2_fit$predict(make_sl3_Task(
    data = data3,
    covariates = c(controls,"edu"),
    outcome = "loginc2",
    outcome_type = "continuous"))
  
  
  
  ###
  
  
  return(
    c( ####Estimator for the mean and group mean
      mean(data$ero),
      mean(data$piero.ind),
      
      mean(as.matrix(data[data$faedu==1,"ero"])),
      mean(as.matrix(data[data$faedu==1,"piero.ind"])),
      
      mean(as.matrix(data[data$faedu==2,"ero"])),
      mean(as.matrix(data[data$faedu==2,"piero.ind"])),
      
      mean(as.matrix(data[data$faedu==3,"ero"])),
      mean(as.matrix(data[data$faedu==3,"piero.ind"])),
      
      ####Residual estimators for variance
      #mean(data$pips.ind1*(data$predresidsquared1+(data$pred1)^2) + data$pips.ind2*(data$predresidsquared2+(data$pred2)^2) + data$pips.ind3*(data$predresidsquared3+(data$pred3)^2) - data$piero.ind^2) + var(data$piero.ind),
      #mean(data$prob1*(data$predresidsquared1+(data$pred1)^2) + data$prob2*(data$predresidsquared2+(data$pred2)^2) + data$prob3*(data$predresidsquared3+(data$pred3)^2) - data$ero^2) + var(data$ero),
      
      
      ####Second moment estimator for variance
      #mean(data$prob1*data$pred21 + data$prob2*data$pred22 + data$prob3*data$pred23 - data$ero^2) + var(psid_imputed$ero),
      #mean(data$pips.ind1*data$pred21 + data$pips.ind2*data$pred22 + data$pips.ind3*data$pred23 - data$piero.ind^2) + var(psid_imputed$piero.ind),
      
      #imputed logged variance,  9,10
      var(data$loginc),
      var(log(data$piero.ind.imp)),
      
      #90/10 ratio
      sort(data$income)[round(0.9*nrow(data))] / 
        sort(data$income)[round(0.1*nrow(data))],
      
      sort(as.matrix(data[!is.na(data$piero.ind.imp),"piero.ind.imp"]))[round(0.9*nrow(data %>% drop_na(piero.ind.imp)))] / 
        sort(as.matrix(data[!is.na(data$piero.ind.imp),"piero.ind.imp"]))[round(0.1*nrow(data %>% drop_na(piero.ind.imp)))] ,
      
      #80/20 ratio
      sort(data$income)[round(0.8*nrow(data))] / 
        sort(data$income)[round(0.2*nrow(data))],
      
      sort(as.matrix(data[!is.na(data$piero.ind.imp),"piero.ind.imp"]))[round(0.8*nrow(data %>% drop_na(piero.ind.imp)))] / 
        sort(as.matrix(data[!is.na(data$piero.ind.imp),"piero.ind.imp"]))[round(0.2*nrow(data %>% drop_na(piero.ind.imp)))] ,
      
      #Gini
      gini.wtd(data$income),
      gini.wtd(data$piero.ind.imp),
      
      
      #CATEs
      mean(pull(data[data$faedu==1,],"pred2"))-mean(pull(data[data$faedu==1,],"pred1")),
      mean(pull(data[data$faedu==2,],"pred2"))-mean(pull(data[data$faedu==2,],"pred1")),
      mean(pull(data[data$faedu==3,],"pred2"))-mean(pull(data[data$faedu==3,],"pred1")),
      mean(pull(data[data$faedu==1,],"pred3"))-mean(pull(data[data$faedu==1,],"pred2")),
      mean(pull(data[data$faedu==2,],"pred3"))-mean(pull(data[data$faedu==2,],"pred2")),
      mean(pull(data[data$faedu==3,],"pred3"))-mean(pull(data[data$faedu==3,],"pred2"))
      
    )
    
  )
  
  
}


boot_result <- bootPb(data=psid_imputed,statistic=bootstat,R=500)


boot_sd <- apply(boot_result[["t"]],2,sd)
means <- as.data.frame(cbind(boot_result[["t0"]][1:8],boot_sd[1:8],rep(1:2,4),c(1,1,2,2,3,3,4,4)))
colnames(means) <- c("value","sd","prepost","type")
means$type <- factor(means$type,levels=c(1,2,3,4),labels=c("Total","Lower than HS Origin","HS Origin","College Origin"))



##This Reproduces Figure 7


miev <- as.data.frame(cbind(boot_result[["t0"]][9:10],boot_sd[9:10],c(1:2)))
colnames(miev) <- c("value","sd","prepost")

nineten <- as.data.frame(cbind(boot_result[["t0"]][11:12],boot_sd[11:12],c(1:2)))
colnames(nineten) <- c("value","sd","prepost")

eighttwo <- as.data.frame(cbind(boot_result[["t0"]][13:14],boot_sd[13:14],c(1:2)))
colnames(eighttwo) <- c("value","sd","prepost")

gini <- as.data.frame(cbind(boot_result[["t0"]][15:16],boot_sd[15:16],c(1:2)))
colnames(gini) <- c("value","sd","prepost")



grid.arrange(
  ggplot(miev,aes(x=prepost,y=value))+
    geom_point()+
    scale_x_continuous(breaks = c(1,2),labels = c("Before","After"))+
    geom_errorbar(width=0.2,aes(ymin=value-1.96*as.numeric(sd),ymax=value+1.96*as.numeric(sd)))+
    theme_bw()+
    theme(text = element_text(size=23),
          axis.title.x = element_blank())+
    labs(y = "Variance of Logged Income"),
  
  ggplot(gini,aes(x=prepost,y=value))+
    geom_point()+
    scale_x_continuous(breaks = c(1,2),labels = c("Before","After"))+
    geom_errorbar(width=0.2,aes(ymin=value-1.96*as.numeric(sd),ymax=value+1.96*as.numeric(sd)))+
    theme_bw()+
    theme(text = element_text(size=23),
          axis.title.x = element_blank())+
    labs(y = "Gini Coefficient"),
  
  ggplot(eighttwo,aes(x=prepost,y=value))+
    geom_point()+
    scale_x_continuous(breaks = c(1,2),labels = c("Before","After"))+
    geom_errorbar(width=0.2,aes(ymin=value-1.96*as.numeric(sd),ymax=value+1.96*as.numeric(sd)))+
    theme_bw()+
    theme(text = element_text(size=23),
          axis.title.x = element_blank())+
    labs(y = "80/20 Ratio of Income"),
  
  ggplot(nineten,aes(x=prepost,y=value))+
    geom_point()+
    scale_x_continuous(breaks = c(1,2),labels = c("Before","After"))+
    geom_errorbar(width=0.2,aes(ymin=value-1.96*as.numeric(sd),ymax=value+1.96*as.numeric(sd)))+
    theme_bw()+
    theme(text = element_text(size=23),
          axis.title.x = element_blank())+
    labs(y = "90/10 Ratio of Income")
  
)





#####This reproduces Figure 8 of the article
stepdraw <- as.data.frame(cbind(c(10:0),
                                steps.dat,
                                rep(boot_sd[10],11),
                                rep(boot_sd[12],11),
                                rep(boot_sd[14],11),
                                rep(boot_sd[16],11)))
colnames(stepdraw) <- c("step","var","nineten","eighttwo","gini","varse","ninetense","eighttwose","ginise")


grid.arrange(
  ggplot(stepdraw,aes(x=step,y=var))+
    geom_point()+
    geom_errorbar(aes(ymin=var-1.96*varse,ymax=var+1.96*varse))+
    scale_x_continuous(breaks = c(0,2,4,6,8,10),labels = c("0","20%","40%","60%","80%","100%"))+
    labs(x="Intervention Strength",y="Variance of Logged Income",col="Education Effect")+
    theme_bw()+
    theme(axis.text = element_text(size=20),
          axis.title = element_text(size=20)),
  
  ggplot(stepdraw,aes(x=step,y=nineten))+
    geom_point()+
    geom_errorbar(aes(ymin=nineten-1.96*ninetense,ymax=nineten+1.96*ninetense))+
    scale_x_continuous(breaks = c(0,2,4,6,8,10),labels = c("0","20%","40%","60%","80%","100%"))+
    labs(x="Intervention Strength",y="90/10 Ratio",col="Education Effect")+
    theme_bw()+
    theme(axis.text = element_text(size=20),
          axis.title = element_text(size=20)),
  
  ggplot(stepdraw,aes(x=step,y=eighttwo))+
    geom_point()+
    geom_errorbar(aes(ymin=eighttwo-1.96*eighttwose,ymax=eighttwo+1.96*eighttwose))+
    scale_x_continuous(breaks = c(0,2,4,6,8,10),labels = c("0","20%","40%","60%","80%","100%"))+
    labs(x="Intervention Strength",y="80/20 Ratio",col="Education Effect")+
    theme_bw()+
    theme(axis.text = element_text(size=20),
          axis.title = element_text(size=20)),
  
  ggplot(stepdraw,aes(x=step,y=gini))+
    geom_point()+
    geom_errorbar(aes(ymin=gini-1.96*ginise,ymax=gini+1.96*ginise))+
    scale_x_continuous(breaks = c(0,2,4,6,8,10),labels = c("0","20%","40%","60%","80%","100%"))+
    labs(x="Intervention Strength",y="Gini Coefficient",col="Education Effect")+
    theme_bw()+
    theme(axis.text = element_text(size=20),
          axis.title = element_text(size=20))
  
)
